import textract
import numpy as np
import scipy
import gensim
import os
import pandas as pd
import re
import math
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download('averaged_perceptron_tagger')
from collections import Counter
from matplotlib import pyplot as plt
from gensim import corpora, models
from itertools import repeat
%matplotlib inline
class FocusGroup:
    """One focus-group .docx transcript, split into speaker labels and utterances.

    The transcript alternates bare speaker labels (e.g. "Parent 1:") with the
    text of that speaker's turn; segments are separated by literal "\\n\\n"
    sequences (backslash-n characters, a by-product of str()-ing textract's
    bytes output), and a "\\n\\n\\n" sequence separates the parent discussion
    from an optional moderator-only discussion.
    """

    # Roles that appear as "<Role> <digit>:" speaker labels in the transcripts.
    _ROLES = ('Parent', 'Moderator', 'Administrator', 'Speaker')

    @staticmethod
    def _is_speaker_label(segment):
        """Return True when `segment` is a bare speaker label such as 'Parent 1:'.

        Stripping an optional " <digit>:" suffix must leave exactly a known role
        name.  (Previously this 4-way comparison was copy-pasted four times per
        comprehension, running re.sub up to eight times per segment.)
        """
        return re.sub(r" [0-9]:", "", segment) in FocusGroup._ROLES

    def __init__(self, filename):
        """Load Data/FocusGroups/<filename>.docx and split it into turns.

        Parameters
        ----------
        filename : str
            Basename (no extension) of the .docx transcript.
        """
        # textract returns bytes; str() renders them as "b'...'", so strip the
        # b-prefix and the quotes.
        self.raw_text = str(textract.process('Data/FocusGroups/' + filename + ".docx")).replace('b\'', '').replace('\'', '')
        # First chunk (before the literal "\n\n\n" backslash sequence) is the
        # discussion that includes parents; split it into individual segments.
        self.parent_moderator_discussion = self.raw_text.split('\\n\\n\\n')[0].split('\\n\\n')
        # Utterances (everything that is not a bare speaker label)...
        self.text_including_parents = np.array(
            [segment for segment in self.parent_moderator_discussion
             if not self._is_speaker_label(segment)])
        # ...and the parallel array of speakers, with the trailing ':' removed.
        self.talkers_including_parents = np.array(
            [segment.replace(':', '') for segment in self.parent_moderator_discussion
             if self._is_speaker_label(segment)])
        # Some transcripts carry a second, moderator-only discussion after the
        # "\n\n\n" separator.
        if len(self.raw_text.split('\\n\\n\\n')) > 1:
            self.within_moderator_discussion = self.raw_text.split('\\n\\n\\n')[1].split('\\n\\n')
            self.text_only_moderators = np.array(
                [segment for segment in self.within_moderator_discussion
                 if not self._is_speaker_label(segment)])
            self.talkers_only_moderators = np.array(
                [segment.replace(':', '') for segment in self.within_moderator_discussion
                 if self._is_speaker_label(segment)])
        # Distinct participants by role, for convenience.
        self.parent_list = [participant for participant in set(self.talkers_including_parents) if 'Parent' in participant]
        self.moderator_list = [participant for participant in set(self.talkers_including_parents) if 'Moderator' in participant]

    def get_participant_text(self, participant):
        """Return the list of utterances spoken by `participant`.

        For moderators, utterances from the moderator-only discussion (when
        present) are appended after those from the parent discussion.  Returns
        None for any participant that is neither a Parent nor a Moderator
        (preserves the original fall-through behavior).
        """
        if 'Parent' in participant:
            mask = [member == participant for member in self.talkers_including_parents]
            return list(self.text_including_parents[mask])
        elif 'Moderator' in participant:
            mask = [member == participant for member in self.talkers_including_parents]
            text_from_parent_discussion = self.text_including_parents[mask]
            if len(self.raw_text.split('\\n\\n\\n')) == 1:
                return list(text_from_parent_discussion)
            else:
                mask = [member == participant for member in self.talkers_only_moderators]
                text_from_moderator_discussion = self.text_only_moderators[mask]
                return list(text_from_parent_discussion) + list(text_from_moderator_discussion)
# Transcript-specific filler words observed in these focus groups (plus
# transcription artifacts like 'tand', 'ti', 'inaudible', 'crosstalk').
custom_stopwords=['go','parent','say','0','yeah','would','okay','start','also','well','u','thank','inaudible','crosstalk','able','hear','actually','hi','oh','definitely','part','anything','sure','anyone','yes','thanks','everything','end','everybody','tand','administrator','whatever','sound','ti','moderator','though','mute','speak','silence','finish','bye','audio']
# Base stopword inventory: NLTK's English list extended with the custom fillers.
stopwords_list = stopwords.words('english') + custom_stopwords

def remove_stopwords_function(tokenized_text, stopwords):
    """Drop every token that appears in the given stopword collection."""
    return [word for word in tokenized_text if word not in stopwords]

# Shared WordNet lemmatizer used by the processing pipeline below.
lemmatizer_instance = WordNetLemmatizer()
# Map Penn Treebank tag initials onto WordNet POS codes (noun/verb/adj/adverb).
pos_tags_lemmatize_mapping_dict = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}

def pos_mapping_function(pos_tag, dictionary=pos_tags_lemmatize_mapping_dict):
    """Translate a Penn Treebank POS tag into a WordNet POS code.

    Only tags starting with N, V, J, or R are looked up; everything else
    falls back to 'n' (noun), WordNet's lemmatization default.
    """
    initial = pos_tag[0]
    if initial not in ('N', 'V', 'J', 'R'):
        return 'n'
    return dictionary[initial]
def lemmatizer_function(text, dictionary=pos_tags_lemmatize_mapping_dict, pos_mapping_function=pos_mapping_function,
                        lemmatizer=lemmatizer_instance):
    """POS-tag a token list and lemmatize each token with its WordNet POS.

    Parameters
    ----------
    text : list[str]
        Tokenized text.
    dictionary : dict
        Accepted for backward compatibility; not used directly here (the POS
        mapping is delegated entirely to `pos_mapping_function`).
    pos_mapping_function : callable
        Maps a Penn Treebank tag to a WordNet POS code.
    lemmatizer : WordNetLemmatizer
        Lemmatizer instance to use.

    Returns
    -------
    list[str]
        The lemmatized tokens, in order.
    """
    pos_tags_for_lemmatize = [(word, pos_mapping_function(pos_tag)) for word, pos_tag in nltk.pos_tag(text)]
    # BUG FIX: lemmatize with the injected `lemmatizer` parameter; the original
    # hard-coded the module-level `lemmatizer_instance`, silently ignoring any
    # caller-supplied lemmatizer.
    return [lemmatizer.lemmatize(word, pos=pos_tag) for word, pos_tag in pos_tags_for_lemmatize]
def text_processing_pipeline(text_list, additional_stopwords, min_token_count=1, stopwords_list=stopwords_list,
                             lemmatizer_function=lemmatizer_function, dictionary=pos_tags_lemmatize_mapping_dict,
                             pos_mapping_function=pos_mapping_function, lemmatizer=lemmatizer_instance):
    """Lowercase, de-punctuate, tokenize, length-filter, de-stopword, and lemmatize texts.

    Parameters
    ----------
    text_list : list[str]
        Raw documents.
    additional_stopwords : list[str]
        Extra stopwords appended to `stopwords_list` for this run.
    min_token_count : int
        Texts are kept only when they contain STRICTLY MORE than this many
        tokens (pre-stopword-removal count).
    stopwords_list, lemmatizer_function : see module level defaults.
    dictionary, pos_mapping_function, lemmatizer :
        Accepted for backward compatibility; not used directly here — they only
        influence processing through `lemmatizer_function`'s own defaults.

    Returns
    -------
    (list[list[str]], list[str])
        Processed token lists and, aligned with them, the original raw texts
        that survived the length filter.
    """
    stopwords_list = stopwords_list + additional_stopwords
    # Lowercase, replace punctuation with spaces, and tokenize in one pass.
    tokenized_texts = [re.sub(r"[^a-zA-Z0-9]", " ", text.lower()).split() for text in text_list]
    # Pair originals with their tokenizations (replaces the old index loop) and
    # keep only texts with strictly more than min_token_count tokens.
    filtering_original_text = [original for original, tokens in zip(text_list, tokenized_texts)
                               if len(tokens) > min_token_count]
    tokenized_texts = [tokens for tokens in tokenized_texts if len(tokens) > min_token_count]
    tokenized_texts = [remove_stopwords_function(tokens, stopwords_list) for tokens in tokenized_texts]
    tokenized_texts = [lemmatizer_function(tokens) for tokens in tokenized_texts]
    # Second stopword pass: lemmatization can map inflected forms onto words
    # that are themselves stopwords (e.g. 'going' -> 'go').
    tokenized_texts = [remove_stopwords_function(tokens, stopwords_list) for tokens in tokenized_texts]
    return tokenized_texts, filtering_original_text
# Focus-group transcript basenames, ordered Gaming, LowPIU, Media, Social.
file_list=['Gaming_Group1', 'Gaming_Group2', 'Gaming_Group3', 'Gaming_Group4',
'LowPIU_Group1', 'LowPIU_Group2', 'LowPIU_Group3',
'Media_Group1', 'Media_Group2', 'Media_Group3', 'Media_Group4',
'Social_Group1', 'Social_Group2', 'Social_Group3', 'Social_Group4']
# Count files per group prefix (digits stripped): Gaming 4, LowPIU 3, Media 4,
# Social 4.  Counter preserves first-seen order, so counts align with the
# group stopword lists below.
additional_stopword_counts = list(Counter(re.sub('[0-9]', '', name) for name in file_list).values())
# High-frequency, low-information words identified per group (via the LDA
# stopword pass further down).
Gaming_group_stopwords=['like', 'get', 'school', 'hour', 'day', 'even', 'think', 'thing', 'way', 'know', 'year', 'week', 'really', 'one',
'kid', 'game', 'use', 'time', 'want', 'play', 'much', 'back']
Low_PIU_group_stopwords=['school', 'like', 'time', 'get', 'think', 'kid', 'really',
'thing', '00', 'technology', 'year', 'child', 'back', 'lot',
'even', 'know', 'want', 'old', 'one']
Media_group_stopwords=['like', 'thing', 'get', 'really', 'kid', 'time', 'want',
'school', 'think', 'know', 'one', 'use',
'year', 'much', 'back', 'work', 'person', 'pandemic',
'see', 'lot', 'good', 'little', 'day', 'old']
Social_group_stopwords=['like', 'get', 'think', 'know', 'thing', 'time', 'school',
'really', 'child', 'see', 'want',
'kid', 'one', 'lot', 'even']
additional_stopwords_list = [Gaming_group_stopwords, Low_PIU_group_stopwords, Media_group_stopwords, Social_group_stopwords]
# Repeat each group's stopword list once per transcript in that group so the
# result lines up 1:1 with file_list (15 entries).
additional_stopwords_list = [group_stopwords
                             for count, group_stopwords in zip(additional_stopword_counts, additional_stopwords_list)
                             for _ in range(count)]
# Parse every transcript from Data/FocusGroups/ (file I/O via textract inside FocusGroup).
all_focusgroup_text=[FocusGroup(focus_group_file) for focus_group_file in file_list]
# Run the text pipeline on each group's parent-discussion turns, paired with that
# group's extra stopwords; min_token_count=60 keeps only substantial turns
# (strictly more than 60 raw tokens).  Each element is a
# (processed_token_lists, kept_original_texts) tuple.
all_focusgroup_processed_text=[text_processing_pipeline(focus_group.text_including_parents,additional_stopword_list, min_token_count=60) for focus_group, additional_stopword_list in zip(all_focusgroup_text, additional_stopwords_list)]
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from bertopic import BERTopic
def collapse_list_of_strings(list_of_strings):
    """Re-join a list of token strings into one space-separated document."""
    separator = ' '
    return separator.join(list_of_strings)
def identify_additional_stopwords_with_LDA(tokenized_text_list, number_of_topics, stopword_threshold, random_state=None):
    """Fit an LDA model and return the words shared by many topics.

    A word whose top-terms appearance spans at least `stopword_threshold` of
    the `number_of_topics` topics carries little topic-discriminating
    information, so it is proposed as an additional stopword.

    Parameters
    ----------
    tokenized_text_list : list[list[str]]
        Tokenized documents.
    number_of_topics : int
        Number of LDA topics to fit.
    stopword_threshold : int
        Minimum number of topics a word must appear in to be returned.
    random_state : int | None
        Optional seed forwarded to gensim's LdaModel.  Defaults to None
        (gensim's own default), preserving the previous behavior; pass an int
        for reproducible runs — previously results varied on every call.

    Returns
    -------
    list[str]
        Candidate stopwords.
    """
    common_dictionary = Dictionary(tokenized_text_list)
    common_corpus = [common_dictionary.doc2bow(text) for text in tokenized_text_list]
    lda = LdaModel(common_corpus, num_topics=number_of_topics, id2word=common_dictionary,
                   alpha='auto', eta='auto', random_state=random_state)
    # Count, over all topics, how often each word shows up among a topic's top terms.
    most_common_words = Counter([term[0] for topic_num in range(number_of_topics)
                                 for term in lda.show_topic(topicid=topic_num)]).most_common()
    return [term[0] for term in most_common_words if term[1] >= stopword_threshold]
# --- Gaming group (transcripts 0-3) ---
# Gather the processed token lists from the pipeline output:
data=[text for gaming_group_text in all_focusgroup_processed_text[0:4] for text in gaming_group_text[0]]
# Prepare the Bag-of-Words representation for LDA:
common_dictionary = Dictionary(data)
common_corpus = [common_dictionary.doc2bow(text) for text in data]
# Print words appearing in the top terms of at least 3 of the 5 topics
# (candidates for the group-specific stopword list above):
print(identify_additional_stopwords_with_LDA(data, 5, 3))
# Fit the LDA model (no fixed random_state, so topics vary between runs):
lda = LdaModel(common_corpus, num_topics=5, id2word=common_dictionary, alpha='auto', eta='auto')
# Check the distribution of topics across documents.
# NOTE(review): [0][0] takes the FIRST topic gensim lists per document, which is
# not guaranteed to be the most probable one — confirm intent.
topic_distribution=[lda.get_document_topics(bow=bow_text)[0][0] for bow_text in common_corpus]
print(Counter(topic_distribution))
# Show the most frequent words per topic with their respective probability:
lda.show_topics()
# Re-join tokens into plain strings: BERTopic expects raw documents, not token lists.
Gaming_group_data=[collapse_list_of_strings(text) for gaming_group_text in all_focusgroup_processed_text[0:4]
for text in gaming_group_text[0]]
# Number of kept documents per transcript (used to expand the per-group dates):
number_of_tokens_per_text=[len(gaming_group_text[0]) for gaming_group_text in all_focusgroup_processed_text[0:4]]
# One collection date per focus group; repeat each date once per document from
# that group so dates align 1:1 with Gaming_group_data:
date_list=['4/1/2020', '5/1/2020', '11/1/2020', '4/1/2021']
date_list_long=[list(repeat(date_list[i], number_of_tokens_per_text[i])) for i in range(4)]
date_list_long=[date for date_list in date_list_long for date in date_list]
# Fit BERTopic on unigrams and track topic prevalence over the four dates:
topic_model=BERTopic(language="english", calculate_probabilities=True, verbose=True, n_gram_range=(1,1))
topics, probs = topic_model.fit_transform(Gaming_group_data)
topics_over_time = topic_model.topics_over_time(Gaming_group_data, topics, date_list_long)
topic_model.visualize_topics_over_time(topics_over_time)
freq = topic_model.get_topic_info(); print(freq.head(10))
# Bar chart for the outlier topic (-1) and topics 0-5:
topic_model.visualize_barchart(topics=list(range(-1,6)), n_words=10)
# --- Low PIU group (transcripts 4-6; the original comment wrongly said "Gaming") ---
# Gather the processed token lists from the pipeline output:
data=[text for gaming_group_text in all_focusgroup_processed_text[4:7] for text in gaming_group_text[0]]
# Prepare the Bag-of-Words representation for LDA:
common_dictionary = Dictionary(data)
common_corpus = [common_dictionary.doc2bow(text) for text in data]
# Print words appearing in the top terms of at least 3 of the 5 topics:
print(identify_additional_stopwords_with_LDA(data, 5, 3))
# Fit the LDA model (no fixed random_state, so topics vary between runs):
lda = LdaModel(common_corpus, num_topics=5, id2word=common_dictionary, alpha='auto', eta='auto')
# Check the distribution of topics across documents.
# NOTE(review): [0][0] takes the FIRST topic listed, not necessarily the most
# probable — confirm intent.
topic_distribution=[lda.get_document_topics(bow=bow_text)[0][0] for bow_text in common_corpus]
print(Counter(topic_distribution))
# Show the most frequent words per topic with their respective probability:
lda.show_topics()
# NOTE(review): the variables below are named "Gaming_..." / "gaming_group_text"
# but actually hold Low PIU data — copy-paste leftovers worth renaming.
Gaming_group_data=[collapse_list_of_strings(text) for gaming_group_text in all_focusgroup_processed_text[4:7]
for text in gaming_group_text[0]]
number_of_tokens_per_text=[len(gaming_group_text[0]) for gaming_group_text in all_focusgroup_processed_text[4:7]]
# Only 3 Low PIU groups exist, so range(3) uses just the first three dates.
# NOTE(review): presumably these three groups ran in the 4/2020, 5/2020 and
# 11/2020 waves — confirm against the study schedule.
date_list=['4/1/2020', '5/1/2020', '11/1/2020', '4/1/2021']
date_list_long=[list(repeat(date_list[i], number_of_tokens_per_text[i])) for i in range(3)]
date_list_long=[date for date_list in date_list_long for date in date_list]
topic_model=BERTopic(language="english", calculate_probabilities=True, verbose=True, n_gram_range=(1,1))
topics, probs = topic_model.fit_transform(Gaming_group_data)
topics_over_time = topic_model.topics_over_time(Gaming_group_data, topics, date_list_long)
topic_model.visualize_topics_over_time(topics_over_time)
freq = topic_model.get_topic_info(); print(freq.head(10))
# Bar chart for the outlier topic (-1) and topics 0-2:
topic_model.visualize_barchart(topics=list(range(-1,3)), n_words=10)
# --- Media group (transcripts 7-10; the original comment wrongly said "Gaming") ---
# Gather the processed token lists from the pipeline output:
data=[text for gaming_group_text in all_focusgroup_processed_text[7:11] for text in gaming_group_text[0]]
# Prepare the Bag-of-Words representation for LDA:
common_dictionary = Dictionary(data)
common_corpus = [common_dictionary.doc2bow(text) for text in data]
# Print words appearing in the top terms of at least 3 of the 5 topics:
print(identify_additional_stopwords_with_LDA(data, 5, 3))
# Fit the LDA model (no fixed random_state, so topics vary between runs):
lda = LdaModel(common_corpus, num_topics=5, id2word=common_dictionary, alpha='auto', eta='auto')
# Check the distribution of topics across documents.
# NOTE(review): [0][0] takes the FIRST topic listed, not necessarily the most
# probable — confirm intent.
topic_distribution=[lda.get_document_topics(bow=bow_text)[0][0] for bow_text in common_corpus]
print(Counter(topic_distribution))
# Show the most frequent words per topic with their respective probability:
lda.show_topics()
# NOTE(review): variables named "Gaming_..." / "gaming_group_text" actually hold
# Media-group data — copy-paste leftovers worth renaming.
Gaming_group_data=[collapse_list_of_strings(text) for gaming_group_text in all_focusgroup_processed_text[7:11]
for text in gaming_group_text[0]]
number_of_tokens_per_text=[len(gaming_group_text[0]) for gaming_group_text in all_focusgroup_processed_text[7:11]]
# One collection date per focus group, expanded to one date per document:
date_list=['4/1/2020', '5/1/2020', '11/1/2020', '4/1/2021']
date_list_long=[list(repeat(date_list[i], number_of_tokens_per_text[i])) for i in range(4)]
date_list_long=[date for date_list in date_list_long for date in date_list]
topic_model=BERTopic(language="english", calculate_probabilities=True, verbose=True, n_gram_range=(1,1))
topics, probs = topic_model.fit_transform(Gaming_group_data)
topics_over_time = topic_model.topics_over_time(Gaming_group_data, topics, date_list_long)
topic_model.visualize_topics_over_time(topics_over_time)
freq = topic_model.get_topic_info(); print(freq.head(10))
# Bar chart for the outlier topic (-1) and topics 0-3:
topic_model.visualize_barchart(topics=list(range(-1,4)), n_words=10)
# --- Social group (transcripts 11-14; the original comment wrongly said "Gaming") ---
# Gather the processed token lists from the pipeline output:
data=[text for gaming_group_text in all_focusgroup_processed_text[11:15] for text in gaming_group_text[0]]
# Prepare the Bag-of-Words representation for LDA:
common_dictionary = Dictionary(data)
common_corpus = [common_dictionary.doc2bow(text) for text in data]
# Print words appearing in the top terms of at least 3 of the 5 topics:
print(identify_additional_stopwords_with_LDA(data, 5, 3))
# Fit the LDA model (no fixed random_state, so topics vary between runs):
lda = LdaModel(common_corpus, num_topics=5, id2word=common_dictionary, alpha='auto', eta='auto')
# Check the distribution of topics across documents.
# NOTE(review): [0][0] takes the FIRST topic listed, not necessarily the most
# probable — confirm intent.
topic_distribution=[lda.get_document_topics(bow=bow_text)[0][0] for bow_text in common_corpus]
print(Counter(topic_distribution))
# Show the most frequent words per topic with their respective probability:
lda.show_topics()
# NOTE(review): variables named "Gaming_..." / "gaming_group_text" actually hold
# Social-group data — copy-paste leftovers worth renaming.
Gaming_group_data=[collapse_list_of_strings(text) for gaming_group_text in all_focusgroup_processed_text[11:15]
for text in gaming_group_text[0]]
number_of_tokens_per_text=[len(gaming_group_text[0]) for gaming_group_text in all_focusgroup_processed_text[11:15]]
# One collection date per focus group, expanded to one date per document:
date_list=['4/1/2020', '5/1/2020', '11/1/2020', '4/1/2021']
date_list_long=[list(repeat(date_list[i], number_of_tokens_per_text[i])) for i in range(4)]
date_list_long=[date for date_list in date_list_long for date in date_list]
topic_model=BERTopic(language="english", calculate_probabilities=True, verbose=True, n_gram_range=(1,1))
topics, probs = topic_model.fit_transform(Gaming_group_data)
topics_over_time = topic_model.topics_over_time(Gaming_group_data, topics, date_list_long)
topic_model.visualize_topics_over_time(topics_over_time)
freq = topic_model.get_topic_info(); print(freq.head(10))
# Bar chart for the outlier topic (-1) and topics 0-3:
topic_model.visualize_barchart(topics=list(range(-1,4)), n_words=10)
CrisisLogger is text data from online surveys. Each row has an upload id tied to the session created when the participant logged their response to the COVID question. The KeyBERT model is applied to this data to extract key phrases from the 132 participant sessions.
import pandas as pd
# Root folder for all datasets used below.
root = "Data/"
df = pd.read_csv(root + 'CrisisLogger/crisislogger.csv')
# Duplicate upload_ids (e.g. 436 and 441 belong to the same session) are merged
# into a single space-joined transcription per id.
df = df.groupby(['upload_id'])['transcriptions'].apply(' '.join).reset_index()
df.head()
from keybert import KeyBERT
# Concatenate every session's transcription into one document, separated by '. '.
mydoc = '. '.join([elem for elem in df.transcriptions])
kw_model = KeyBERT()
# Extract 3-gram key phrases; MMR with low diversity (0.2) trades a little
# redundancy reduction for staying close to the document's main themes.
keywords = kw_model.extract_keywords(mydoc,
keyphrase_ngram_range=(3,3),
use_mmr=True,
diversity=0.2
)
keywords
The main themes gathered are parents working from home, teenagers being particularly challenging during this period, and difficult work environments.
from wordcloud import WordCloud
# Build one corpus string from all transcriptions.
# BUG FIX: the original appended rows with "".join (no separator), gluing the
# last word of one response to the first word of the next and corrupting the
# word counts; join with spaces instead (also avoids the quadratic `+=` loop).
corpus = ' '.join(df.iloc[i]['transcriptions'] for i in range(len(df)))
from collections import Counter
# Filler words to exclude from the counts (original list deduplicated —
# 'even' and 'one' appeared twice; membership semantics unchanged).
swords = ['know', 'like', 'get', 'one', 'much', 'also', 'even', 'u', 'lot', 'go', 'way', 'day', 'see', 'really']
sword_set = set(swords)  # O(1) membership tests
word_list = corpus.split(' ')
resultwords = [word for word in word_list if word not in sword_set]
# `result` feeds the WordCloud below.
result = ' '.join(resultwords)
# BUG FIX: count `resultwords` directly (the original re-split `result`, which
# just reproduced the same tokens) and bind the counts to `word_counts` instead
# of shadowing the Counter class itself.
word_counts = Counter(resultwords)
most_occur = word_counts.most_common(30)
most_occur
# Render a word cloud of the stopword-filtered corpus string.
wordcloud = WordCloud(width=400, height=400, background_color='white', min_font_size=6).generate(result)
import matplotlib.pyplot as plt
plt.figure(figsize = (8,8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
from sklearn.feature_extraction.text import CountVectorizer
# PORTABILITY FIX: use forward slashes — pandas accepts them on Windows as
# well, while the original backslash paths broke on macOS/Linux (and were
# inconsistent with the forward-slash `root`).
df_p1 = pd.read_csv(root + 'ProlificAcademic/April 2020/Data/CRISIS_Parent_April_2020.csv')
df_p2 = pd.read_csv(root + 'ProlificAcademic/May 2020/Data/CRISIS_Parent_May_2020.csv')
df_p3 = pd.read_csv(root + 'ProlificAcademic/November 2020/Data/CRISIS_Parent_November_2020.csv')
df_p4 = pd.read_csv(root + 'ProlificAcademic/April 2021/Data/CRISIS_Parent_April_2021.csv')
# Keep only the survey timestamp and the free-text "specify positive" answer,
# dropping rows where either is missing.
df_pos_1 = df_p1[['timestamp1', 'specifypositive']].dropna()
df_pos_2 = df_p2[['timestamp1', 'specifypositive']].dropna()
df_pos_3 = df_p3[['timestamp1', 'specifypositive']].dropna()
df_pos_4 = df_p4[['timestamp1', 'specifypositive']].dropna()
# Stack all four waves; documents and their timestamps stay row-aligned.
df_combined = pd.concat([df_pos_1, df_pos_2, df_pos_3, df_pos_4])
positive_things = df_combined.specifypositive.to_list()
# Parse timestamps into pandas Timestamps for BERTopic's topics-over-time.
dates = df_combined['timestamp1'].apply(lambda x: pd.Timestamp(x)).to_list()
# Unigram+bigram counts with English stopwords for BERTopic's topic representations.
vectorizer_model = CountVectorizer(ngram_range=(1,2), stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model,
#min_topic_size=70,
n_gram_range=(1,2),
verbose=True)
# Fit on the free-text "positive things" answers; per-document topic ids returned.
topics, _ = topic_model.fit_transform(positive_things)
topic_model.get_topic_info()
topic_model.visualize_topics()
# Bin documents into 40 time slices to track topic prevalence across the four waves.
topics_over_time = topic_model.topics_over_time(positive_things, topics, dates, nr_bins=40)
topic_model.visualize_topics_over_time(topics_over_time)
# Inspect the two topics discussed in the interpretation paragraph below.
topic_model.get_topic(0)
topic_model.get_topic(3)
The blue line (Topic 0) represents the topic whose words mostly relate to enjoyment of the lockdown. The plot shows that this factor peaked at the beginning of the lockdown in April 2020 and then gradually decreased as families grew used to the new routine of being at home all the time. The same holds for Topic 3: abundant family time was a welcome change in the early days, but by November 2020 this element is no longer as frequent — i.e., its importance dropped over time.